import json
import sys

def read_jsonl(file_path):
    """Lazily yield the parsed JSON value of each non-blank line in a file.

    The file is opened as UTF-8 and read one line at a time. Blank or
    whitespace-only lines are skipped silently. A line that is not valid
    JSON is reported on stderr and skipped, so one bad record does not
    abort the whole read.

    Args:
        file_path: Path of the JSONL file to read.

    Yields:
        The value decoded by ``json.loads`` for each parseable line.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            text = line.strip()
            if not text:
                # An empty line (e.g. a trailing newline at end of file) is
                # not a record; parsing it would only produce a false error.
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError as e:
                # Echo the stripped text so the message has no stray newline.
                print(f"解析错误: {e} 在行: {text}", file=sys.stderr)
                continue
            yield record

def count_label_distribution(file_path):
    """Print how often each ``label`` value occurs in a JSONL file.

    Records are consumed one at a time from ``read_jsonl``. A progress line
    is printed after every 10000 records, followed by the per-label counts
    (in order of first appearance) and the total number of records.

    Args:
        file_path: Path of the JSONL file to scan.
    """
    label_count = {}
    total_count = 0

    for item in read_jsonl(file_path):
        if isinstance(item, dict):
            # Records without a "label" key are grouped under "未匹配".
            label = item.get("label", "未匹配")
        else:
            # A line can hold valid JSON that is not an object (a list, a
            # number, ...); it has no "label" field, so count it the same way
            # instead of failing on the missing .get method.
            label = "未匹配"

        if isinstance(label, (list, dict)):
            # Lists and dicts cannot be dict keys; key them by their JSON text.
            label = json.dumps(label, ensure_ascii=False, sort_keys=True)

        label_count[label] = label_count.get(label, 0) + 1
        total_count += 1

        # Report the number of records actually processed so far; a 0-based
        # index would announce "0" up front and lag by one afterwards.
        if total_count % 10000 == 0:
            print(f"已处理 {total_count} 行...")

    print("\nlabel 分布:")
    for label, count in label_count.items():
        print(f"  {label}: {count}")

    print(f"\n总计: {total_count} 条数据")

def main():
    """Command-line entry point: expect exactly one argument, the input file.

    With any other number of arguments, print the usage line to stderr and
    exit with status 1. Otherwise announce the file and print its label
    distribution.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("用法: python label_distribution.py <输入文件>", file=sys.stderr)
        sys.exit(1)

    (target_path,) = cli_args
    print(f"开始统计 {target_path} 中的 label 分布...")
    count_label_distribution(target_path)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

